#!/usr/bin/perl
#Programmer: Rory Carmichael
#Purpose: Collect statistics about input files for rcclust

use strict;
use warnings;

# Reads the file named by the first command-line argument, skips its first
# two lines, and for every remaining line counts the whitespace-separated
# fields that do not contain '-'. Prints a histogram of those per-line
# counts plus the total number of lines and "peaks" to STDOUT.
# Format complaints go to STDERR.

my %col_hist = ();   # number of counted fields on a line => number of such lines
my $numlines = 0;    # lines processed after the two leading lines
my $numpeaks = 0;    # counted fields that contain the digit 0

# Pattern for a line made up only of digits, '-', whitespace and '|'.
# NOTE(review): inside a character class '|' is a literal pipe, not
# alternation, so pipes are accepted too -- confirm whether that is intended.
my $data_line_re = qr/^[0-9|\-|\s]*$/;

my $infile = $ARGV[0];
if (!defined $infile) {
	print STDERR "Usage: $0 <infile>\n";
	exit 1;
}

# Three-argument open with a lexical handle: the file name can never be
# interpreted as an open mode, and a missing/unreadable file is reported
# as such instead of being misdiagnosed as "badly formatted".
open(my $in_fh, '<', $infile) or die "Cannot open $infile: $!\n";

my $firstline = <$in_fh>;    # first line is read and ignored
my $secondline = <$in_fh>;

# A file with fewer than two lines is treated like an empty second line,
# which the check below rejects.
$secondline = '' unless defined $secondline;

# The second line is rejected when it DOES look like a data line.
# NOTE(review): presumably the first two lines are headers -- confirm
# against the rcclust input format.
if ($secondline =~ $data_line_re) {
	print STDERR "$infile is badly formatted.\n";
	print STDERR "BAD LINE 2: $secondline\n";
	close($in_fh);
	exit 1;
}

# Read line by line rather than slurping the whole file into a list.
while (my $line = <$in_fh>) {
	if ($line !~ $data_line_re) {
		# Malformed data lines are reported but still counted below.
		print STDERR "$infile is badly formatted.\n";
		# Parentheses are required: '.' and '+' have equal precedence and
		# associate left, so without them the prefix is numified away.
		# The first line seen by this loop is line 3 of the file.
		print STDERR "BAD LINE " . ($numlines + 3) . ": $line\n";
	}
	my @splitline = split(/\s+/, $line);
	my $numcols = 0;
	foreach my $part (@splitline) {
		# Fields containing '-' are not counted as columns.
		next if $part =~ /-/;
		$numcols++;
		# NOTE(review): this matches any field containing a '0' digit
		# (e.g. "10"), not only fields equal to 0 -- confirm intent.
		$numpeaks++ if $part =~ /0/;
	}
	$col_hist{$numcols}++;
	$numlines++;
}
close($in_fh);

print "Column Histogram\n";
foreach my $key (sort { $a <=> $b } keys %col_hist) {
	print "$key\t" . $col_hist{$key} . "\n";
}
print "Lines: $numlines\tPeaks: $numpeaks\n";
